rm(list =ls())
options(scipen=999)
gc()
packages <-c("tidyverse","estimatr","plm","stargazer","fastDummies","readstata13","ihs")

new.packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

lapply(packages, require, character.only = TRUE)
rm(packages, new.packages)

setwd("Put working directory here")

## insheet data
data <- read.csv("./Datasets/panel_bare_bones.csv")

## some cleaning
data$X <- NULL
data$locality_year <- NULL
data$subbotnik_inkind <- NULL
data$locality <- as.character(tolower(data$locality))

## specify control variables to be loaded in
controls <- c("locality", "region", "pop", "schools", "shops", "restaurants", "cinemas",
              "Frac48", "coal_sqm", "coal_mines", "minerals", "partition", "priests_continuous", 
              "pop", "protests_40s", "sabotage_40s","terror_40s", "income_76_capita", 
              "income_77_capita", "commanders1945", "commanders1946", "commanders1947",
              "commanders1948")

## merge controls 
data <- merge(data, read.csv("./Datasets/main_confounders.csv")[,controls], by = c("locality"), all.x = T)
data$region_numeric <- as.numeric(as.factor(data$region))
rm(controls)

## add Prussian census variables
data <- merge(data, read.csv("./Datasets/prussia_clean.csv"), by = c("locality"), all.x = T)

## small cleaning
data$X <- NULL
data$partition <- ifelse(data$partition=="Russian",1,0)


#######################################
### Clean pre-treatment variables #####
#######################################

## aggregate panel data, take mean (because controls are not time varying)
data_t <- aggregate(data, by=list(Category=data$locality_numeric), FUN=mean, na.rm=T)

# wealth index
data_t$shops <- scale(data_t$shops)
data_t$restaurants <- scale(data_t$restaurants)
data_t$cinemas <- scale(data_t$cinemas)
data_t$wealth_index <- rowSums(data_t[,c("shops","restaurants","cinemas")], na.rm = T)
head(data_t[,c("shops", "restaurants", "cinemas", "wealth_index")], 50)
data_t$wealth_index[data_t$wealth_index == "NaN"] = NA  
data_t$wealth <- scale(data_t$wealth_index)

# state capacity
data_t$state_capacity <- data_t$schools

# ethnic diversity
data_t$ethnic_diversity <- data_t$Frac48

# Russian occupation
data_t$Russian_occupation <- data_t$partition

# industrialization
data_t$industrialization <- data_t$minerals

# incomes
data_t$income_76_capita <- scale(data_t$income_76_capita)
data_t$income_77_capita <- scale(data_t$income_77_capita)
data_t$income <- rowSums(data_t[,c("income_76_capita", "income_77_capita")], na.rm = T)
data_t$income <- scale(data_t$income)

# capture all pre-treatment variables (not post-treatment!)
confounders <- c("Russian_occupation", "industrialization", "coal_sqm", "protests_40s",
                 "sabotage_40s", "terror_40s", "jew_perc", "pop_male_perc", "pop_female_perc",
                 "protestant_perc", "catholic_perc", "other_christ_perc", "jew_perc", 
                 "other_relig_perc", "age_under_ten_perc", "literate_perc", "school_noinfo_perc",
                 "illiterate_perc", "commanders1945", "commanders1946", "commanders1947", "commanders1948")

# mean imputation for missings
impute_mean = function(x) {coalesce(x, mean(x, na.rm = TRUE))}
data_t <- data_t %>% mutate_at(vars(confounders), impute_mean)

# scale all confounders
data_t <- data_t %>% mutate_at(confounders, funs(c(scale(.))))



##################
### RUN MODELS ###
##################

# make "treatment" binary to facilitate interpretation
data_t$priest <- ifelse(data_t$priests_continuous>0,1,0)

# rename vars for stagazer
colnames(data_t) <- gsub("_", "", colnames(data_t))

confounders <- c("Russianoccupation", "industrialization", "coalsqm", "protests40s",
                 "sabotage40s", "terror40s", "jewperc", "popmaleperc", "popfemaleperc",
                 "protestantperc", "catholicperc", "otherchristperc", "jewperc", 
                 "otherreligperc", "ageundertenperc", "literateperc", "schoolnoinfoperc",
                 "illiterateperc", "commanders1945", "commanders1946", "commanders1947", "commanders1948")

m_selection <- lm(paste("priest ~ ", paste(confounders, collapse=" + ")), data = data_t)
summary(m_selection)


###################
### Output data ###
###################

stargazer(m_selection,
          dep.var.labels = c("Corrupt priests"),
          style = "qje",
          covariate.labels = c("Russian occupation", 
                               "Industrialization", 
                               "Coal deposits",
                               "Protests (40s)",
                               "Sabotage (40s)",
                               "Terror (40s)",
                               "Jews perc (1871)",
                               "Men perc (1871)",
                               "Women perc (1871)",
                               "Protestants perc (1871)",
                               "Catholics perc (1871)",
                               "Other Christian perc (1871)",
                               "Other religion perc (1871)",
                               "Age under 10 perc (1871)",
                               "Literate perc (1871)",
                               "No school perc (1871)",
                               "Illiterate perc (1871)",
                               "Secret police officers (1945)",
                               "Secret police officers (1946)",
                               "Secret police officers (1947)",
                               "Secret police officers (1948)"),                               
          star.char = c("*", "**", "***"),
          star.cutoffs = c(0.05, 0.01, 0.001),
          omit = c("Constant", "factor"),
          omit.stat = c("rsq", "f", "ser"), 
          omit.table.layout = "n"
          out = "PUT YOUR FILEPATH HERE")



